In [63]:
import graphlab
In [64]:
# Load the product-review table from the local 'amazon_baby.gl/' SFrame directory.
products = graphlab.SFrame('amazon_baby.gl/')
In [65]:
# Preview the first rows of the freshly loaded table.
products.head()
Out[65]:
In [66]:
# Add a 'word_count' column computed from each review's text; later cells
# look words up in it by key to read their per-review counts.
products['word_count'] = graphlab.text_analytics.count_words(
    products['review']
)
In [67]:
# Preview again to confirm the new 'word_count' column is present.
products.head()
Out[67]:
In [88]:
# Point GraphLab Canvas at the 'ipynb' target so the .show() calls below
# render in the notebook.
# NOTE(review): the recorded execution count for this cell (88) is later than
# the .show() cells that follow it — confirm it runs first on a fresh kernel.
graphlab.canvas.set_target('ipynb')
In [69]:
# Visualise the 'name' column (which products the reviews are about).
products['name'].show()
In [70]:
# Keep only the rows whose product name is the Sophie giraffe teether.
# This runs before the 3-star filter below, so it is taken from the full table.
giraffe_reviews = products[
    products['name'] == 'Vulli Sophie the Giraffe Teether'
]
In [71]:
# Number of reviews for the giraffe teether.
len(giraffe_reviews)
Out[71]:
In [72]:
# Show the rating distribution for the giraffe teether, treating ratings as categories.
giraffe_reviews['rating'].show(view='Categorical')
In [73]:
# Show the rating distribution across every product, treating ratings as categories.
products['rating'].show(view='Categorical')
In [74]:
# Drop every review rated exactly 3 stars before sentiment labels are assigned.
products = products[
    products['rating'] != 3
]
In [75]:
# Label each remaining review: a rating of 4 or above (4* or 5*) is positive sentiment.
products['sentiment'] = (products['rating'] >= 4)
In [76]:
# Preview the table now that 3-star rows are gone and 'sentiment' has been added.
products.head()
Out[76]:
In [77]:
# Randomly split the labelled reviews with fraction 0.8 going to train_data;
# the fixed seed keeps the split repeatable.
train_data, test_data = products.random_split(0.8, seed=0)
In [78]:
# Logistic classifier predicting 'sentiment' from the 'word_count' feature only.
# test_data doubles as the validation set here.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data
)
In [79]:
# Evaluate the word-count model on test_data.
sentiment_model.evaluate(test_data)
Out[79]:
In [80]:
# Evaluate again, requesting only the 'roc_curve' metric.
sentiment_model.evaluate(test_data, metric='roc_curve')
Out[80]:
In [34]:
# Display the model's 'Evaluation' view.
sentiment_model.show(view='Evaluation')
In [35]:
# Score every giraffe review with the word-count model and keep the predicted
# probability alongside the review.
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(
    giraffe_reviews,
    output_type='probability'
)
In [36]:
# Preview the giraffe reviews with their new 'predicted_sentiment' column.
giraffe_reviews.head()
Out[36]:
In [37]:
# Reorder the giraffe reviews from highest to lowest predicted sentiment.
giraffe_reviews = giraffe_reviews.sort(
    'predicted_sentiment',
    ascending=False
)
In [38]:
# Preview the top of the sorted table (highest predicted sentiment first).
giraffe_reviews.head()
Out[38]:
In [23]:
# Text of the review with the highest predicted sentiment (row 0 after the descending sort).
giraffe_reviews[0]['review']
Out[23]:
In [24]:
# Text of the review with the second-highest predicted sentiment.
giraffe_reviews[1]['review']
Out[24]:
In [25]:
# Text of the review with the lowest predicted sentiment (last row after the descending sort).
giraffe_reviews[-1]['review']
Out[25]:
In [26]:
# Text of the review with the second-lowest predicted sentiment.
giraffe_reviews[-2]['review']
Out[26]:
In the notebook above, we created a column ‘word_count’ with the word counts for each review. Our first task is to create a new column in the products SFrame with the counts for each selected_word above, and, in the process, we will see how the method .apply() can be used to create new columns in our data (our features) and how to use a Python function, which is an extremely useful concept to grasp!
In [15]:
# Hand-picked words; each one becomes its own count column and a feature
# of the selected-words model trained further down.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate'
]
In [81]:
def make_word_counter(word):
    """Build a function that reads how often `word` occurs in one review.

    Replaces the eleven copy-pasted `<word>_count` helpers, which differed
    only in the word they looked up.

    Args:
        word: The word whose count should be read.

    Returns:
        A one-argument function. It takes a review's word-count mapping
        (word -> number of occurrences) and returns the count stored for
        `word`, or 0 when `word` is not a key of the mapping.
    """
    def count_word(word_count):
        if word in word_count:
            return word_count[word]
        return 0

    # Name the counter like the hand-written helper it replaces, e.g. 'awesome_count'.
    count_word.__name__ = '{0}_count'.format(word)
    return count_word


# Create one feature column per selected word, named after the word itself.
# A factory is used instead of a lambda in the loop body so that every counter
# is bound to its own word rather than to the loop variable.
for word in selected_words:
    counter = make_word_counter(word)
    # Keep <word>_count (awesome_count, great_count, ...) bound under its
    # original name so any cell that still calls a helper directly keeps working.
    globals()[counter.__name__] = counter
    # dtype is pinned to int so the column type does not depend on type
    # inference from the leading rows; counts are whole numbers.
    products[word] = products['word_count'].apply(counter, dtype=int)
In [82]:
# products['awesome'] = products['word_count'].apply(awesome_count)
In [83]:
# # Generalize function for apply
# def selected_words_count(word_count, word):
# if word in word_count:
# return word_count[word]
# return 0
In [84]:
# for word in selected_words:
# products[word] = products.apply(lambda x: selected_words_count(x['word_count'], word))
In [85]:
# Preview the table to check the per-word count columns were added.
products.head()
Out[85]:
In [18]:
# Report how many times each selected word appears across all reviews
# (the sum of that word's count column).
# A parenthesised single-argument print emits the same text under Python 2's
# print statement and Python 3's print function.
print('Word count value:')
for word in selected_words:
    print('{0}: {1}'.format(word, products[word].sum()))
# Totals recorded from an earlier run ('great' and 'love' were reported as floats):
# awesome: 2002
# great: 42420.0
# fantastic: 873
# amazing: 1305
# love: 40277.0
# horrible: 659
# bad: 3197
# terrible: 673
# awful: 345
# wow: 131
# hate: 1057
In [89]:
# Redo the split (same fraction and seed as before) so that train_data and
# test_data carry the per-word count columns added after the first split.
train_data, test_data = products.random_split(0.8, seed=0)
In [90]:
# Logistic classifier predicting 'sentiment' from the selected-word count
# columns only. test_data doubles as the validation set here.
selected_words_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=selected_words,
    validation_set=test_data
)
In [98]:
# Pull the 'coefficients' entry out of the selected-words model.
coef = selected_words_model['coefficients']
In [99]:
# Order the coefficients from largest to smallest 'value', then display them
# (largest value first).
coef = coef.sort('value', ascending=False)
coef
Out[99]:
In [100]:
# Display the same coefficients smallest 'value' first; the result is not stored.
coef.sort('value', ascending=True)
Out[100]:
Hint: we discussed the majority class classifier in lecture, which simply predicts that every data point is from the most common class. This baseline is something we definitely want to beat with models we learn from data.
In [93]:
# Evaluate the selected-words model on test_data.
selected_words_model.evaluate(test_data)
Out[93]:
In [62]:
# Evaluate the all-words model on the current test_data, for a side-by-side
# comparison with the selected-words model.
sentiment_model.evaluate(test_data)
Out[62]:
In [94]:
# Evaluate the selected-words model again, requesting only the 'roc_curve' metric.
selected_words_model.evaluate(test_data, metric='roc_curve')
Out[94]:
In [95]:
# Display the selected-words model's 'Evaluation' view.
selected_words_model.show(view='Evaluation')
To understand why the model with all word counts performs better than the one with only the selected_words, we will now examine the reviews for a particular product.
We will investigate a product named ‘Baby Trend Diaper Champ’. (This is a trash can for soiled baby diapers, which keeps the smell contained.)
Just like we did for the reviews for the giraffe toy in the IPython Notebook in the lecture video, before we start our analysis you should select all reviews where the product name is ‘Baby Trend Diaper Champ’. Let’s call this table diaper_champ_reviews.
Again, just as in the video, use the sentiment_model to predict the sentiment of each review in diaper_champ_reviews and sort the results according to their ‘predicted_sentiment’.
What is the ‘predicted_sentiment’ for the most positive review for ‘Baby Trend Diaper Champ’ according to the sentiment_model from the IPython Notebook from lecture? Save this result to answer the quiz at the end.
Now use the selected_words_model you learned using just the selected_words to predict the sentiment of the most positive review you found above. Hint: if you sorted the diaper_champ_reviews in descending order (from most positive to most negative), this command will be helpful to make the prediction you need:
In [48]:
# Keep only the rows whose product name is 'Baby Trend Diaper Champ'.
diaper_champ_reviews = products[
    products['name'] == 'Baby Trend Diaper Champ'
]
In [49]:
# Preview the Diaper Champ reviews.
diaper_champ_reviews.head()
Out[49]:
In [50]:
# Score every Diaper Champ review with the all-words sentiment_model and keep
# the predicted probability alongside the review.
diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(
    diaper_champ_reviews,
    output_type='probability'
)
In [51]:
# Reorder the Diaper Champ reviews from highest to lowest predicted sentiment.
diaper_champ_reviews = diaper_champ_reviews.sort(
    'predicted_sentiment',
    ascending=False
)
In [52]:
# Preview the top of the sorted table (highest predicted sentiment first).
diaper_champ_reviews.head()
Out[52]:
In [97]:
# Highest 'predicted_sentiment' among the Diaper Champ reviews, as scored by sentiment_model.
diaper_champ_reviews['predicted_sentiment'].max()
Out[97]:
In [54]:
# Score only the first row (the review sentiment_model ranked most positive,
# given the descending sort) with the selected-words model.
selected_words_model.predict(diaper_champ_reviews[0:1], output_type='probability')
Out[54]:
In [56]:
# diaper_champ_reviews['predicted_sentiment_2'] = selected_words_model.predict(diaper_champ_reviews, output_type='probability')
# The assignment above is disabled, so no selected-words prediction column is
# added here; this only previews the table sorted by sentiment_model's score.
diaper_champ_reviews.head()
Out[56]:
In [58]:
# Text of the Diaper Champ review with the highest predicted sentiment (row 0 after the descending sort).
diaper_champ_reviews[0]['review']
Out[58]:
In [61]:
# Word counts of that same top-ranked review, to see which words it contains.
diaper_champ_reviews[0]['word_count']
Out[61]:
In [59]:
# Text of the Diaper Champ review with the second-highest predicted sentiment.
diaper_champ_reviews[1]['review']
Out[59]:
In [60]:
# Text of the Diaper Champ review with the lowest predicted sentiment (last row after the descending sort).
diaper_champ_reviews[-1]['review']
Out[60]: